# Load the raw data file.
# Earlier variant of the load step, kept for reference:
# data <- read.csv("../data/filledDatabase.csv")[,-c(2:9,11:13)]
data <- read.csv("../data/filledDatabaseNUMONLY_042620.csv")

# Clean the data, then collapse it (both helpers are defined outside this
# section).
data <- clean_data(data) %>% collapse_data()

# Keep the compound identifier and the group category apart from the
# predictors.
compound <- data$Compound
group_cat <- data$GroupCat

# Prepare data for modeling: drop the identifier column so that only the
# response (GroupCat) and the predictors remain.
data <- select(data, -Compound)
# data_pca <- get_pc_space(data[,-1], k = 13) %>% scale() %>% data.frame()

# Split the row indices into 5 held-out folds for cross validation later.
# seq_len() is used instead of 1:nrow(data) so an empty data frame cannot
# produce the bogus index vector c(1, 0).
# NOTE(review): fold assignment is random and no seed is set here, so fold
# membership changes between runs; consider calling set.seed() beforehand.
# NOTE(review): the folds are built from row indices, not from GroupCat, so
# they are not stratified by class; confirm this is intended.
folds <- caret::createFolds(
  seq_len(nrow(data)),
  k = 5, list = TRUE, returnTrain = FALSE
)

Multinomial Regression

library(glmnet)
# Build the inputs for glmnet: a predictor matrix and the response.
# NOTE(review): `data[, -1]` assumes GroupCat is the first column of `data`;
# confirm against the output of clean_data() / collapse_data().
X <- as.matrix(data[, -1])
Y <- as.matrix(data$GroupCat)

Shrinkage

Ridge

# Fit the multinomial model with the ridge penalty (alpha = 0), then plot the
# coefficient paths against lambda with the curves labelled.
model_ridge <- glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 0
)
plot(model_ridge, xvar = "lambda", label = TRUE)

LASSO

# Fit the multinomial model with the lasso penalty (alpha = 1), then plot the
# coefficient paths against lambda with the curves labelled.
model_lasso <- glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 1
)
plot(model_lasso, xvar = "lambda", label = TRUE)

Coefficient

Ridge

# 5-fold cross-validated ridge fit (alpha = 0), scored by multinomial deviance.
ridge_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 0,
  nfolds = 5,
  type.measure = "deviance"
)

# Extract the coefficients at lambda.min, keep one column per class, drop the
# intercept row and plot what is left.
get_coef(ridge_cv, tuning_parameter = ridge_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Hexagonal, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

LASSO

# 5-fold cross-validated lasso fit (alpha = 1), scored by multinomial deviance.
lasso_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 1,
  nfolds = 5,
  type.measure = "deviance"
)

# Extract the coefficients at lambda.min, keep one column per class, drop the
# intercept row and plot what is left.
get_coef(lasso_cv, tuning_parameter = lasso_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Hexagonal, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

Elastic Net

library(caret)
# Tune an elastic net with caret: 5-fold cross validation over a grid sized by
# tuneLength = 10, with GroupCat modelled on all remaining columns of `data`.
elastic_cv <- train(
  GroupCat ~ .,
  data = data,
  method = "glmnet",
  trControl = trainControl(method = "cv", number = 5),
  tuneLength = 10
)

# Coefficients of the final model at the selected lambda. All four classes are
# kept (Hexagonal included) so this plot matches the ridge and lasso
# coefficient plots; the intercept row is dropped before plotting.
elastic_cv$finalModel %>%
  get_coef(tuning_parameter = elastic_cv$bestTune$lambda) %>%
  select(feature, Cubic, Tilted, Hexagonal, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

Accurate classification rate

Ridge

# Build the prediction table for ridge (alpha = 0) at the lambda.min selected
# by ridge_cv. prediction_table() is defined outside this section.
tb_ridge <- prediction_table(alpha = 0, lambda = ridge_cv$lambda.min)

# `$r` is rendered as one value per fold plus their mean (see printed output).
print_accurate_tb(tb_ridge$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.6904762 0.702381 0.7471264 0.8214286 0.7176471 0.7358118
# Render the ridge class-by-class table as counts.
highlight_tb_count(tb_ridge$t)
Cubic Tilted Hexagonal Others
Cubic 71 14 17 7
Tilted 26 186 2 24
Hexagonal 6 0 30 3
Others 4 9 0 25
Total 107 209 49 59
# Render the same ridge table as proportions.
highlight_tb_percent(tb_ridge$t)
Cubic Tilted Hexagonal Others
Cubic 0.66 0.07 0.35 0.12
Tilted 0.24 0.89 0.04 0.41
Hexagonal 0.06 0 0.61 0.05
Others 0.04 0.04 0 0.42
Total 100% 100% 100% 100%

LASSO

# Build the prediction table for lasso (alpha = 1) at the lambda.min selected
# by lasso_cv. prediction_table() is defined outside this section.
tb_lasso <- prediction_table(alpha = 1, lambda = lasso_cv$lambda.min)

# `$r` is rendered as one value per fold plus their mean (see printed output).
print_accurate_tb(tb_lasso$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.7142857 0.7261905 0.7356322 0.8214286 0.7294118 0.7453897
# Render the lasso class-by-class table as counts.
highlight_tb_count(tb_lasso$t)
Cubic Tilted Hexagonal Others
Cubic 74 14 14 7
Tilted 23 186 3 23
Hexagonal 5 0 31 4
Others 5 9 1 25
Total 107 209 49 59
# Render the same lasso table as proportions.
highlight_tb_percent(tb_lasso$t)
Cubic Tilted Hexagonal Others
Cubic 0.69 0.07 0.29 0.12
Tilted 0.21 0.89 0.06 0.39
Hexagonal 0.05 0 0.63 0.07
Others 0.05 0.04 0.02 0.42
Total 100% 100% 100% 100%

Elastic Net

# Build the prediction table for the elastic net at the alpha / lambda pair
# caret selected. The tuning values are read by name rather than by column
# position, so the call does not depend on the column order of bestTune.
tb_elastic <- prediction_table(
  alpha = elastic_cv$bestTune$alpha,
  lambda = elastic_cv$bestTune$lambda
)

# `$r` is rendered as one value per fold plus their mean (see printed output).
print_accurate_tb(tb_elastic$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.7142857 0.7142857 0.7471264 0.797619 0.7411765 0.7428987
# Render the elastic net class-by-class table as counts.
highlight_tb_count(tb_elastic$t)
Cubic Tilted Hexagonal Others
Cubic 73 16 13 11
Tilted 23 180 2 18
Hexagonal 6 1 34 2
Others 5 12 0 28
Total 107 209 49 59
# Render the same elastic net table as proportions.
highlight_tb_percent(tb_elastic$t)
Cubic Tilted Hexagonal Others
Cubic 0.68 0.08 0.27 0.19
Tilted 0.21 0.86 0.04 0.31
Hexagonal 0.06 0 0.69 0.03
Others 0.05 0.06 0 0.47
Total 100% 100% 100% 100%